import pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, monotonically_increasing_idnp.random.seed(42)pio.renderers.default ="notebook"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data#print("---This is Diagnostic check, No need to print it in the final doc---")#df.printSchema() # comment this line when rendering the submission#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/18 02:55:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:> (0 + 1) / 1] 25/10/18 02:55:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2 Data Cleaning
from pyspark.sql.functions import coldf = df.withColumn("SALARY", col("SALARY").cast("float"))df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float"))df = df.withColumn("SALARY_TO", col("SALARY_TO").cast("float"))df = df.withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))# Compute median salarymedian_from = df.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]median_to = df.approxQuantile("SALARY_TO", [0.5], 0.01)[0]median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]print("Medians:",median_from, median_to, median_salary)# Impute missing 'SALARY_FROM' and 'SALARY_TO' with their mediansdf = df.fillna({"SALARY_FROM": median_from,"SALARY_TO": median_to,"SALARY": median_salary})# Compute 'AVERAGE_SALARY'df = df.withColumn("AVERAGE_SALARY", (col("SALARY_FROM") + col("SALARY_TO")) /2)# Impute missing 'SALARY' with AVERAGE_SALARY, and if that's missing, with the median salaryfrom pyspark.sql.functions import whendf = df.withColumn("SALARY", when( col("SALARY").isNull(), when(col("AVERAGE_SALARY").isNotNull(), col("AVERAGE_SALARY")) .otherwise(median_salary) ).otherwise(col("SALARY")))from pyspark.sql.functions import regexp_replacedf = df.withColumn("EDUCATION_LEVELS_NAME", regexp_replace(col("EDUCATION_LEVELS_NAME"), r'[\n\r]', ''))# Overwritedf.write.option("header", True).mode("overwrite").csv("data/lightcast_job_postings_cleaned.csv")# Display row countprint(f"Rows retained after cleaning: {df.count()}")
Salaries vary widely between industries, with sectors like Information and Finance & Insurance generally showing higher salary ranges than industries such as Accommodation and Food Services. Full-time positions tend to have higher median salaries across most industries compared to part-time or other employment types.
4 Salary Analysis by Occupation (Bubble Chart)
# Lot Occupation Namesalary_analysis = spark.sql(""" SELECT LOT_OCCUPATION_NAME AS OCCUPATION_NAME, PERCENTILE(SALARY, 0.5) AS Median_Salary, COUNT(*) AS Job_Postings FROM job_postings GROUP BY LOT_OCCUPATION_NAME ORDER BY Job_Postings DESC LIMIT 10""")salary_pd = salary_analysis.toPandas()salary_pd.head()import plotly.express as pxfig = px.scatter( salary_pd, x="OCCUPATION_NAME", y="Median_Salary", size="Job_Postings", title="Salary Analysis by LOT Occupation Type (Bubble Chart)", labels={"OCCUPATION_NAME": "LOT Occupation","Median_Salary": "Median Salary","Job_Postings": "Number of Job Postings" }, hover_name="OCCUPATION_NAME", size_max=60, width=1000, height=600, color="Job_Postings", color_continuous_scale="Viridis",)# Layout Customizationfig.update_layout( title={'text': "Salary Analysis by Occupation Type (Bubble Chart)",'x': 0.5,'xanchor': 'center', }, font_family="Arial", font_size=14, title_font_size=25, xaxis_title="LOT Occupation", yaxis_title="Median Salary", plot_bgcolor="#f6f9fa", width=1000, height=600, xaxis=dict( tickangle=-15, showline=True, linecolor="#444" ), yaxis=dict( showline=True, linecolor="#444" ), xaxis_title_font=dict(size=17), yaxis_title_font=dict(size=17),)fig.show()fig.write_image("figures/salary_by_occupation.png", scale=2)
[Stage 10:> (0 + 1) / 1]
The bubble chart reveals that certain occupations have much higher job posting volumes, while also revealing a broad range of median salaries. High-demand roles tend to cluster at higher salary levels, indicating strong competition for talent in these areas.
5 Salary by Education Level
# Education levelslower_deg = ["Bachelor", "Associate", "GED", "No Education Listed", "High school"]higher_deg = ["Master", "PhD", "Doctorate", "professional degree"]# Add EDU_GROUP columndf = df.withColumn("EDU_GROUP", when( col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in lower_deg])),"Bachelor's or lower" ).when( col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in higher_deg])),"Master's or PhD" ).otherwise("Other"))# Cast columnsdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Filter for non-null and positive valuesdf = df.filter( (col("MAX_YEARS_EXPERIENCE").isNotNull()) & (col("Average_Salary").isNotNull()) & (col("MAX_YEARS_EXPERIENCE") >0) & (col("Average_Salary") >0))# Filter for education groupsdf_filtered = df.filter( col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PhD"))# Convert to Pandasdf_pd = df_filtered.toPandas()df_pd.head()# Scatter plot: Experience vs. Salary by Education Groupimport plotly.express as px# Plotfig1 = px.scatter( df_pd, x="MAX_YEARS_EXPERIENCE", y="Average_Salary", color="EDU_GROUP", hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"], title="<b>Experience vs Salary by Education Level</b>", opacity=0.7, color_discrete_sequence=["#36B37E", "#A259EC"] # Custom green & purple)# Add bordersfig1.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))# Update layoutfig1.update_layout( plot_bgcolor="#f9f9f9", paper_bgcolor="#EAF7FF", # Softer blue background font=dict(family="Segoe UI", size=14), title_font=dict(size=22), xaxis_title="Years of Experience", yaxis_title="Average Salary (USD)", legend_title="Education Group", hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"), margin=dict(t=70, b=60, l=60, r=60), xaxis=dict( gridcolor="lightgrey", tickmode="linear", dtick=1, zeroline=False ), yaxis=dict( gridcolor="lightgrey", zeroline=False ))fig1.show()fig.write_image("figures/salary_by_education.png", scale=2)
[Stage 13:> (0 + 1) / 1]
Salaries generally rise with years of experience, regardless of education level. In this dataset though, the “bachelor’s or lower” group actually shows a wider spread and sometimes even higher salaries than the master’s or PhD group. This could be because some top-paying roles in fields like tech, sales, or management don’t always require advanced degrees, or because the “bachelor’s or lower” category includes a lot of jobs where the education isn’t specifically listed, so there’s a lot of variation in pay.
6 Salary by Remote Work Type
# Categorize remote work typesdf = df.withColumn("REMOTE_GROUP",when(col("REMOTE_TYPE_NAME") =="Remote", "Remote").when(col("REMOTE_TYPE_NAME") =="Hybrid", "Hybrid").otherwise("Onsite"))# Ensure correct typesdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Filter out invalid rowsdf_remote = df.filter((col("MAX_YEARS_EXPERIENCE").isNotNull()) &(col("Average_Salary").isNotNull()) &(col("MAX_YEARS_EXPERIENCE") >0) &(col("Average_Salary") >0))# Convert to pandasdf_remote_pd = df_remote.select("MAX_YEARS_EXPERIENCE","Average_Salary","LOT_V6_SPECIALIZED_OCCUPATION_NAME","REMOTE_GROUP").toPandas()# Define colorsremote_palette = {"Onsite": "#60A5FA", # blue"Remote": "#E15759", # red"Hybrid": "#7CCBA2"# green}import plotly.express as px# Scatter plot: Experience vs Salary by Remote Work Typefig_remote = px.scatter(df_remote_pd,x="MAX_YEARS_EXPERIENCE",y="Average_Salary",color="REMOTE_GROUP",color_discrete_map=remote_palette,opacity=0.7,hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],title="<b>Experience vs Salary by Remote Work Type</b>")fig_remote.update_traces(marker=dict(size=7, line=dict(width=1, color="black")))fig_remote.update_layout(plot_bgcolor="#F7FFF9",paper_bgcolor="#E9FFF0",font=dict(family="Segoe UI", size=14),title_font=dict(size=22),xaxis_title="Years of Experience",yaxis_title="Average Salary (USD)",legend_title="Remote Work Type",hoverlabel=dict(bgcolor="white", font_size=13, font_family="Arial"),margin=dict(t=70, b=60, l=60, r=60),xaxis=dict(gridcolor="lightgrey",tickmode="linear",dtick=1,zeroline=False),yaxis=dict(gridcolor="lightgrey",zeroline=False))fig_remote.show()fig.write_image("figures/salary_by_remote_type.png", scale=2)
[Stage 14:> (0 + 1) / 1]
The scatter plot shows that average salary generally increases with years of experience for both remote and onsite positions. There is no dramatic separation between the two work types, but remote roles appear somewhat more concentrated at mid-level experience, while onsite roles are distributed more broadly across the experience range.
The salary distribution for onsite roles is tightly clustered, with most job postings offering salaries around the $100,000 mark. There are relatively few onsite positions at significantly higher salary levels, and the distribution drops off sharply past $150,000.
The salary distribution for remote roles is also centered near $100,000, but it shows a broader spread of salaries compared to onsite roles. There are slightly more remote postings with higher and lower salary extremes, suggesting greater variability in remote job pay.